Fijamos la semilla para poder reproducir los experimentos. También se fija el numero de CPU’s a utilizar.
set.seed(42)
options(mc.cores = 24)Se importan las librerÃas a utilizar a lo largo de la notebook:
# install.packages(pacman)
# install.packages("https://cran.r-project.org/src/contrib/rstan_2.21.2.tar.gz",repos = NULL,type="source")
# sudo apt-get install libglpk-devlibrary(pacman)
p_load(tidyverse, tidymodels, rsample, rstan, shinystan, rstanarm, devtools)
p_load_gh('adrianmarino/commons')
import('../src/dataset.R')## [1] "-> '../src/dataset.R' script loadded successfuly!"
import('../src/plot.R')## [1] "-> '../src/plot.R' script loadded successfuly!"
import('../src/model.R')## [1] "-> './bayesian_regression_predictor.R' script loadded successfuly!"
## [1] "-> '../src/model.R' script loadded successfuly!"
Palmer Penguins
dataset <- load_dataset() %>% mutate_if(is.character, as.factor)
dataset %>% glimpse()## Rows: 344
## Columns: 8
## $ species <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
## $ island <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
## $ bill_length_mm <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
## $ bill_depth_mm <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
## $ flipper_length_mm <dbl> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
## $ body_mass_g <dbl> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
## $ sex <fct> male, female, female, NA, female, male, female, male…
## $ year <dbl> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…
Se enumeran y describen breve-mente cada variable que forma parte del dataset:
Variables Numéricas:
Variables Categóricas:
A continuación veamos los posibles valores de las variables categóricas:
show_values(dataset %>% select_if(negate(is.numeric)))## _____________
## species n
## =============
## Adelie 152
## Chinstrap 68
## Gentoo 124
## ¯¯¯¯¯¯¯¯¯¯¯¯¯
## _____________
## island n
## =============
## Biscoe 168
## Dream 124
## Torgersen 52
## ¯¯¯¯¯¯¯¯¯¯¯¯¯
## __________
## sex n
## ==========
## female 165
## male 168
## NA 11
## ¯¯¯¯¯¯¯¯¯¯
missings_summary(dataset)hist_plots(dataset)Observaciones
box_plots(dataset)Observaciones
No se registran valores mas extremos que el mÃnimo y máximo valor en cada variables. Es decir que no encontramos outliers.
outliers(dataset, column='bill_length_mm')## $inf
## [1] 32.1
##
## $sup
## [1] 59.6
outliers(dataset, column='bill_length_mm')## $inf
## [1] 32.1
##
## $sup
## [1] 59.6
outliers(dataset, column='bill_depth_mm')## $inf
## [1] 13.1
##
## $sup
## [1] 21.5
outliers(dataset, column='flipper_length_mm')## $inf
## [1] 172
##
## $sup
## [1] 231
outliers(dataset, column='body_mass_g')## $inf
## [1] 2700
##
## $sup
## [1] 6300
outliers(dataset, column='year')## $inf
## [1] 2007
##
## $sup
## [1] 2009
bar_plots(dataset)Observaciones
dataset <- dataset %>% drop_na()
missings_summary(dataset)## Warning: attributes are not identical across measure variables;
## they will be dropped
corr_plot(dataset %>% dplyr::select(-year))segmented_pairs_plot(dataset, segment_column='species')train_test <- train_test_split(dataset, train_size = 0.7, shuffle = TRUE)## [1] "Train set size: 233"
## [1] "Test set size: 100"
train_set <- train_test[[1]]
test_set <- train_test[[2]]lineal_model_1 <- lm(
body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm,
data = train_set
)bayesion_model_1 <- stan(
model_code = "
data {
int<lower=1> obs_count;
vector<lower=1>[obs_count] x1;
vector<lower=1>[obs_count] x2;
vector<lower=1>[obs_count] x3;
vector[obs_count] y;
}
parameters {
real beta0;
real beta1;
real beta2;
real beta3;
real<lower=0> sigma;
}
model {
beta0 ~ normal(0, 8000);
beta1 ~ normal(0, 100);
beta2 ~ normal(0, 100);
beta3 ~ normal(0, 100);
sigma ~ exponential(0.1);
y ~ normal(beta0 + beta1 * x1 + beta2 * x2 + beta3 * x3, sigma);
}
",
data = list(
obs_count = nrow(train_set),
y = colvalues(train_set, 'body_mass_g'),
x1 = colvalues(train_set, 'bill_length_mm'),
x2 = colvalues(train_set, 'bill_depth_mm'),
x3 = colvalues(train_set, 'flipper_length_mm')
),
chains = 3,
iter = 300,
warmup = 180,
thin = 1
)## Running /usr/local/lib/R/bin/R CMD SHLIB foo.c
## gcc -I"/usr/local/lib/R/include" -DNDEBUG -I"/usr/local/lib/R/site-library/Rcpp/include/" -I"/usr/local/lib/R/site-library/RcppEigen/include/" -I"/usr/local/lib/R/site-library/RcppEigen/include/unsupported" -I"/usr/local/lib/R/site-library/BH/include" -I"/usr/local/lib/R/site-library/StanHeaders/include/src/" -I"/usr/local/lib/R/site-library/StanHeaders/include/" -I"/usr/local/lib/R/site-library/RcppParallel/include/" -I"/usr/local/lib/R/site-library/rstan/include" -DEIGEN_NO_DEBUG -DBOOST_DISABLE_ASSERTS -DBOOST_PENDING_INTEGER_LOG2_HPP -DSTAN_THREADS -DBOOST_NO_AUTO_PTR -include '/usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp' -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1 -I/usr/local/include -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c foo.c -o foo.o
## In file included from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Core:88,
## from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Dense:1,
## from /usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13,
## from <command-line>:
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:1: error: unknown type name ‘namespace’
## 628 | namespace Eigen {
## | ^~~~~~~~~
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:17: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’ before ‘{’ token
## 628 | namespace Eigen {
## | ^
## In file included from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Dense:1,
## from /usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13,
## from <command-line>:
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Core:96:10: fatal error: complex: No such file or directory
## 96 | #include <complex>
## | ^~~~~~~~~
## compilation terminated.
## make: *** [/usr/local/lib/R/etc/Makeconf:168: foo.o] Error 1
params_1 <- c('beta0', 'beta1', 'beta2', 'beta3', 'sigma')
traceplot(bayesion_model_1, pars = params_1, inc_warmup = TRUE)lm_vs_br_coeficients(lineal_model_1, bayesion_model_1, params_1)vars_1 <- c('bill_length_mm', 'bill_depth_mm', 'flipper_length_mm')
lm_vs_br_models_validation(lineal_model_1, bayesion_model_1, params_1, vars_1, test_set)bayesion_predictor_1 <- BayesianRegressionPredictor.from(bayesion_model_1, params_1, vars_1)
plot_compare_fit(
lineal_model_1,
bayesion_predictor_1,
train_set,
label_1='Regresion Lineal',
label_2='Regresion Bayesiana'
)lineal_model_2 <- lm(
body_mass_g
~ bill_length_mm
+ bill_depth_mm
+ flipper_length_mm
+ sex,
data = train_set
)Antes que nada transformamos la columna categórica a one-hot encoding:
cat_cols <- c('sex')
train_set_cat <- train_set %>% dummify(cat_cols)
test_set_cat <- test_set %>% dummify(cat_cols)
train_set_catConstruimos una matriz con todas las variables(X) mas el intercept:
to_model_input <- function(df) {
model.matrix(
body_mass_g
~ bill_length_mm
+ bill_depth_mm
+ flipper_length_mm
+ sex_female
+ sex_male,
data = df
)
}
train_X <- train_set_cat %>% to_model_input()
test_X <- test_set_cat %>% to_model_input()Definimos y corremos el modelo bayesiano:
bayesion_model_2 <- stan(
model_code = "
data {
int<lower=1> obs_count;
int<lower=1> coef_count;
matrix[obs_count,coef_count] X;
vector[obs_count] y;
}
parameters {
vector[coef_count] beta;
real<lower=0> sigma;
}
model {
beta[1] ~ normal(0, 2000);
beta[2] ~ normal(0, 30);
beta[3] ~ normal(0, 100);
beta[4] ~ normal(0, 100);
beta[5] ~ normal(0, 100);
beta[6] ~ normal(0, 100);
sigma ~ exponential(0.1);
y ~ normal(X * beta, sigma);
}
",
data = list(
obs_count = dim(train_X)[1],
coef_count = dim(train_X)[2],
y = colvalues(train_set_cat, 'body_mass_g'),
X = train_X
),
chains = 3,
iter = 300,
warmup = 180,
thin = 1
)## Running /usr/local/lib/R/bin/R CMD SHLIB foo.c
## gcc -I"/usr/local/lib/R/include" -DNDEBUG -I"/usr/local/lib/R/site-library/Rcpp/include/" -I"/usr/local/lib/R/site-library/RcppEigen/include/" -I"/usr/local/lib/R/site-library/RcppEigen/include/unsupported" -I"/usr/local/lib/R/site-library/BH/include" -I"/usr/local/lib/R/site-library/StanHeaders/include/src/" -I"/usr/local/lib/R/site-library/StanHeaders/include/" -I"/usr/local/lib/R/site-library/RcppParallel/include/" -I"/usr/local/lib/R/site-library/rstan/include" -DEIGEN_NO_DEBUG -DBOOST_DISABLE_ASSERTS -DBOOST_PENDING_INTEGER_LOG2_HPP -DSTAN_THREADS -DBOOST_NO_AUTO_PTR -include '/usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp' -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1 -I/usr/local/include -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c foo.c -o foo.o
## In file included from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Core:88,
## from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Dense:1,
## from /usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13,
## from <command-line>:
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:1: error: unknown type name ‘namespace’
## 628 | namespace Eigen {
## | ^~~~~~~~~
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:17: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’ before ‘{’ token
## 628 | namespace Eigen {
## | ^
## In file included from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Dense:1,
## from /usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13,
## from <command-line>:
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Core:96:10: fatal error: complex: No such file or directory
## 96 | #include <complex>
## | ^~~~~~~~~
## compilation terminated.
## make: *** [/usr/local/lib/R/etc/Makeconf:168: foo.o] Error 1
params_2 <- c('beta[1]', 'beta[2]', 'beta[3]', 'beta[4]', 'beta[5]', 'beta[6]', 'sigma')
traceplot(bayesion_model_2, inc_warmup = TRUE, pars = params_2)lineal_model_2$coefficients## (Intercept) bill_length_mm bill_depth_mm flipper_length_mm
## -1922.6152066 -0.5330962 -92.9216684 37.2016333
## sexmale
## 534.1583252
br_coeficients(bayesion_model_2, params_2)vars_2 <- c('bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'sex_female','sex_male')
lm_vs_br_models_validation(
lineal_model_2,
bayesion_model_2,
params_2,
vars_2,
test_set,
test_set_cat
)bayesion_predictor_2 <- BayesianRegressionPredictor.from(bayesion_model_2, params_2, vars_2)
plot_compare_fit(
lineal_model_2,
bayesion_predictor_2,
test_set,
test_set_cat,
label_1='Regresion Lineal',
label_2='Regresion Bayesiana'
)A continuación vamos a agregar outliers a la variable flipper_length_mm, la cual define la longitud de la aleta del individuo medida en milÃmetros.
Para visualizar los nuevos outliers a continuación graficamos flipper_length_mm vs. body_mass_g antes y después de agregar outliers:
plot_data(train_set)train_set_with_outliers <- train_set %>%
mutate(flipper_length_mm = ifelse(
body_mass_g > 4900 & body_mass_g < 5000,
flipper_length_mm + (flipper_length_mm * runif(1, 0.1, 0.2)),
flipper_length_mm
))
plot_data(train_set_with_outliers)lineal_model_3 <- lm(
body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm,
data = train_set_with_outliers
)Comparemos el ajuste del modelo lineal ajustando en un dataset de train con y sin outliers:
plot_compare_fit(
lineal_model_1,
lineal_model_3,
train_set_with_outliers,
label_1='Regresión Lineal SIN outliers',
label_2='Regresión Lineal CON outliters'
)Se aprecia que el modelo entrenado en el train set outliers esta apalancado por las observaciones atipicas.
Entrenamos una regresión lineal múltiple robusta para intentar de disminuir el efecto de los nuevos outliers.
p_load(MASS)
robust_lineal_model_3 <- rlm(
body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm,
data = train_set_with_outliers
)plot_compare_fit(
lineal_model_1,
robust_lineal_model_3,
train_set_with_outliers,
label_1='Regresión Lineal SIN outliers',
label_2='Regresión Lineal Robusta CON outliters'
)Gráficamente no se llega a distinguir pero el modelo robusto termina ajustando mejor que el modelo lineal clásico. Esto se puede observar cuando comparamos el RMSE/MAE sobre train.
lm_vs_lm_models_validation(lineal_model_3, robust_lineal_model_3, train_set_with_outliers)## Warning in actual - predicted: longer object length is not a multiple of shorter
## object length
## Warning in actual - predicted: longer object length is not a multiple of shorter
## object length
## Warning in actual - predicted: longer object length is not a multiple of shorter
## object length
## Warning in actual - predicted: longer object length is not a multiple of shorter
## object length
Mas alla de esto el modelo clasico sigue dando mejores resultado en test:
lm_vs_lm_models_validation(lineal_model_3, robust_lineal_model_3, test_set)bayesion_model_3 <- stan(
model_code = "
data {
int<lower=1> obs_count;
vector<lower=1>[obs_count] x1;
vector<lower=1>[obs_count] x2;
vector<lower=1>[obs_count] x3;
vector[obs_count] y;
}
parameters {
real beta0;
real beta1;
real beta2;
real beta3;
real<lower=0> sigma;
}
model {
beta0 ~ normal(0, 8000);
beta1 ~ normal(0, 100);
beta2 ~ normal(0, 100);
beta3 ~ normal(0, 100);
sigma ~ exponential(0.1);
y ~ normal(beta0 + beta1 * x1 + beta2 * x2 + beta3 * x3, sigma);
}
",
data = list(
obs_count = nrow(train_set_with_outliers),
y = colvalues(train_set_with_outliers, 'body_mass_g'),
x1 = colvalues(train_set_with_outliers, 'bill_length_mm'),
x2 = colvalues(train_set_with_outliers, 'bill_depth_mm'),
x3 = colvalues(train_set_with_outliers, 'flipper_length_mm')
),
chains = 3,
iter = 300,
warmup = 180,
thin = 1
)
params_3 <- c('beta0', 'beta1', 'beta2', 'beta3', 'sigma')
traceplot(bayesion_model_3, pars = params_3, inc_warmup = TRUE)lm_vs_br_coeficients(robust_lineal_model_3, bayesion_model_3, params_3)vars_3 <- c('bill_length_mm', 'bill_depth_mm', 'flipper_length_mm')
lm_vs_br_models_validation(robust_lineal_model_3, bayesion_model_3, params_3, vars_3, test_set)bayesion_predictor_3 <- BayesianRegressionPredictor.from(bayesion_model_3, params_3, vars_3)
plot_compare_fit(
robust_lineal_model_3,
bayesion_predictor_3,
train_set,
label_1='Regresion Lineal Robusta CON outliers',
label_2='Regresion Bayesiana CON outliers'
)COMPLETAR
En este aso entrenamos solo con el 10% de lo datos.
train_test <- train_test_split(dataset, train_size = 0.05, shuffle = TRUE)## [1] "Train set size: 16"
## [1] "Test set size: 317"
train_set_4 <- train_test[[1]]
test_set_4 <- train_test[[2]]plot_data(train_set_4)lineal_model_4 <- lm(
body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm,
data = train_set_4
)bayesion_model_4 <- stan(
model_code = "
data {
int<lower=1> obs_count;
vector<lower=1>[obs_count] x1;
vector<lower=1>[obs_count] x2;
vector<lower=1>[obs_count] x3;
vector[obs_count] y;
}
parameters {
real beta0;
real beta1;
real beta2;
real beta3;
real<lower=0> sigma;
}
model {
beta0 ~ normal(0, 8000);
beta1 ~ normal(0, 100);
beta2 ~ normal(0, 100);
beta3 ~ normal(0, 100);
sigma ~ exponential(0.1);
y ~ normal(beta0 + beta1 * x1 + beta2 * x2 + beta3 * x3, sigma);
}
",
data = list(
obs_count = nrow(train_set_4),
y = colvalues(train_set_4, 'body_mass_g'),
x1 = colvalues(train_set_4, 'bill_length_mm'),
x2 = colvalues(train_set_4, 'bill_depth_mm'),
x3 = colvalues(train_set_4, 'flipper_length_mm')
),
chains = 3,
iter = 300,
warmup = 180,
thin = 1
)
params_4 <- c('beta0', 'beta1', 'beta2', 'beta3', 'sigma')
traceplot(bayesion_model_4, pars = params_4, inc_warmup = TRUE)Coeficientes de la regresión múltiple:
lineal_model_4$coefficients## (Intercept) bill_length_mm bill_depth_mm flipper_length_mm
## -8926.26327 24.74177 81.05387 52.87000
Coeficientes descubiertos por la regresión múltiple bayesiana:
for(param in params_4) print(get_posterior_mean(bayesion_model_4, par=param)[4])## [1] -7895.822
## [1] 26.30672
## [1] 53.77526
## [1] 49.6672
## [1] 236.508
vars_4 <- c('bill_length_mm', 'bill_depth_mm', 'flipper_length_mm')
lm_vs_br_models_validation(lineal_model_4, bayesion_model_4, params_4, vars_4, test_set)bayesion_predictor_4 <- BayesianRegressionPredictor.from(bayesion_model_4, params_4, vars_4)
plot_compare_fit(
lineal_model_4,
bayesion_predictor_4,
train_set,
label_1='Regresion Lineal',
label_2='Regresion Bayesiana'
)Definimos una distribución uniforme para el beta asociado a la variable flipper_length_mm.
## Running /usr/local/lib/R/bin/R CMD SHLIB foo.c
## gcc -I"/usr/local/lib/R/include" -DNDEBUG -I"/usr/local/lib/R/site-library/Rcpp/include/" -I"/usr/local/lib/R/site-library/RcppEigen/include/" -I"/usr/local/lib/R/site-library/RcppEigen/include/unsupported" -I"/usr/local/lib/R/site-library/BH/include" -I"/usr/local/lib/R/site-library/StanHeaders/include/src/" -I"/usr/local/lib/R/site-library/StanHeaders/include/" -I"/usr/local/lib/R/site-library/RcppParallel/include/" -I"/usr/local/lib/R/site-library/rstan/include" -DEIGEN_NO_DEBUG -DBOOST_DISABLE_ASSERTS -DBOOST_PENDING_INTEGER_LOG2_HPP -DSTAN_THREADS -DBOOST_NO_AUTO_PTR -include '/usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp' -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1 -I/usr/local/include -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c foo.c -o foo.o
## In file included from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Core:88,
## from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Dense:1,
## from /usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13,
## from <command-line>:
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:1: error: unknown type name ‘namespace’
## 628 | namespace Eigen {
## | ^~~~~~~~~
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:17: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’ before ‘{’ token
## 628 | namespace Eigen {
## | ^
## In file included from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Dense:1,
## from /usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13,
## from <command-line>:
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Core:96:10: fatal error: complex: No such file or directory
## 96 | #include <complex>
## | ^~~~~~~~~
## compilation terminated.
## make: *** [/usr/local/lib/R/etc/Makeconf:168: foo.o] Error 1
br_vs_br_coeficients(bayesion_model_1, bayesion_model_5, params_5)lm_vs_br_models_validation(lineal_model_1, bayesion_model_1, params_1, vars_1, test_set)vars_5 <- c('bill_length_mm', 'bill_depth_mm', 'flipper_length_mm')
lm_vs_br_models_validation(lineal_model_1, bayesion_model_5, params_5, vars_5, test_set)bayesion_predictor_5 <- BayesianRegressionPredictor.from(bayesion_model_5, params_5, vars_5)
plot_compare_fit(
bayesion_predictor_1,
bayesion_predictor_5,
train_set,
label_1='Regresion Bayesiana con dist informativa',
label_2='Regresion Bayesiana con dist menos informativa'
)COMPLETAR